TL;DR

Applying our zero-inflated model to the cortical data (L5 only).

Unnormalized data

We select only the cells that pass QC and that were clustered (no -1 labels), color coded by the cluster labels. I will focus on the top 1,000 most variable genes. Note that the data are not public, hence it will work only on Davide’s system (for now).

data("cortical")
labels <- read.table("~/git/brainData/analysis/results_160705/cluster_labels.txt", stringsAsFactors = FALSE, sep='\t', comment.char = "%")

l5_data <- cortical[, rownames(labels[labels$ClusterMergedLabel!="-1",])]

filter <- apply(assay(l5_data)>10, 1, sum)>=10

Number of retained genes:

print(sum(filter))
## [1] 8036

To speed up the computations, I will focus on the top 1,000 most variable genes.

l5_data <- l5_data[filter,]
core <- assay(l5_data)

vars <- rowVars(log1p(core))
names(vars) <- rownames(core)
vars <- sort(vars, decreasing = TRUE)
core <- core[names(vars)[1:1000],]

First, let’s look at PCA (of the log counts) for reference.

colMerged <- labels[labels$ClusterMergedLabel!="-1", "ClusterMergedColor"]

detection_rate <- colSums(core>0)
coverage <- colSums(core)

pca <- prcomp(t(log1p(core)))
plot(pca$x, col=colMerged, pch=19, main="PCA of log-counts, centered not scaled")

df <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                       PC1          PC2 detection_rate   coverage
## PC1             1.0000000  0.156529233   -0.797149452 -0.8500634
## PC2             0.1565292  1.000000000   -0.001071276 -0.2898612
## detection_rate -0.7971495 -0.001071276    1.000000000  0.4682109
## coverage       -0.8500634 -0.289861237    0.468210942  1.0000000

Now, let’s see how ZINB works.

print(system.time(zinb_Vall <- zinbFit(core, ncores = 3, K = 2)))
##    user  system elapsed 
## 123.079  26.572  65.975
plot(zinb_Vall@W, col=colMerged, pch=19, xlab="W1", ylab="W2", main="ZINB")

#total number of detected genes in the cell
df <- data.frame(W1=zinb_Vall@W[,1], W2=zinb_Vall@W[,2], gamma_mu = zinb_Vall@gamma_mu[1,], gamma_pi = zinb_Vall@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                        W1          W2   gamma_mu    gamma_pi
## W1              1.0000000 -0.37088778  0.5808957 -0.68002463
## W2             -0.3708878  1.00000000 -0.2975745  0.01314882
## gamma_mu        0.5808957 -0.29757451  1.0000000 -0.28499939
## gamma_pi       -0.6800246  0.01314882 -0.2849994  1.00000000
## detection_rate  0.7381200 -0.05866148  0.4529805 -0.97174374
## coverage        0.5464538 -0.38472669  0.9552966 -0.32764896
##                detection_rate   coverage
## W1                 0.73811996  0.5464538
## W2                -0.05866148 -0.3847267
## gamma_mu           0.45298050  0.9552966
## gamma_pi          -0.97174374 -0.3276490
## detection_rate     1.00000000  0.4682109
## coverage           0.46821094  1.0000000

Accounting for batch effects

batch <- droplevels(colData(l5_data)$MD_c1_run_id)
mod <- model.matrix(~batch)

boxplot(df$W1~batch)

boxplot(df$W2~batch)

print(system.time(zinb_X <- zinbFit(core, ncores = 3, K = 2, X=mod)))
##    user  system elapsed 
## 322.892  56.209 140.675
plot(zinb_X@W, col=colMerged, pch=19, xlab="W1", ylab="W2", main="ZINB")

#total number of detected genes in the cell
df <- data.frame(W1=zinb_X@W[,1], W2=zinb_X@W[,2], gamma_mu = zinb_X@gamma_mu[1,], gamma_pi = zinb_X@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                        W1         W2   gamma_mu   gamma_pi detection_rate
## W1              1.0000000 -0.4285219  0.5895325 -0.5957512      0.6873644
## W2             -0.4285219  1.0000000 -0.4851126  0.1285883     -0.1901410
## gamma_mu        0.5895325 -0.4851126  1.0000000 -0.3227129      0.4838874
## gamma_pi       -0.5957512  0.1285883 -0.3227129  1.0000000     -0.9544406
## detection_rate  0.6873644 -0.1901410  0.4838874 -0.9544406      1.0000000
## coverage        0.5129822 -0.5692522  0.9523981 -0.3437921      0.4682109
##                  coverage
## W1              0.5129822
## W2             -0.5692522
## gamma_mu        0.9523981
## gamma_pi       -0.3437921
## detection_rate  0.4682109
## coverage        1.0000000
boxplot(df$W1~batch)

boxplot(df$W2~batch)

Accounting for quality

qc <- as.matrix(colData(l5_data)[,1:14])
qcpca <- prcomp(qc, center=TRUE, scale.=TRUE)
mod <- model.matrix(~batch + qcpca$x[,1:2])

plot(df$W1~qcpca$x[,1], pch=19, col=colMerged)

plot(df$W2~qcpca$x[,1], pch=19, col=colMerged)

print(system.time(zinb_q <- zinbFit(core, ncores = 3, K = 2, X=mod)))
##    user  system elapsed 
## 372.457  61.163 163.809
plot(zinb_q@W, col=colMerged, pch=19, xlab="W1", ylab="W2", main="ZINB")

#total number of detected genes in the cell
df <- data.frame(W1=zinb_q@W[,1], W2=zinb_q@W[,2], gamma_mu = zinb_q@gamma_mu[1,], gamma_pi = zinb_q@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                        W1           W2     gamma_mu   gamma_pi
## W1              1.0000000  0.182434988  0.194849547 -0.3957306
## W2              0.1824350  1.000000000  0.003560832 -0.1673526
## gamma_mu        0.1948495  0.003560832  1.000000000 -0.3569265
## gamma_pi       -0.3957306 -0.167352629 -0.356926451  1.0000000
## detection_rate  0.4379676  0.180685646  0.502168620 -0.9535241
## coverage        0.1098789  0.005351177  0.953320973 -0.3530355
##                detection_rate     coverage
## W1                  0.4379676  0.109878881
## W2                  0.1806856  0.005351177
## gamma_mu            0.5021686  0.953320973
## gamma_pi           -0.9535241 -0.353035509
## detection_rate      1.0000000  0.468210942
## coverage            0.4682109  1.000000000
boxplot(df$W1~batch)

boxplot(df$W2~batch)

Three dimensions

print(system.time(zinb_3 <- zinbFit(core, ncores = 3, K = 3, X=mod)))
##    user  system elapsed 
## 401.602  66.262 173.422
pairs(zinb_3@W, col=colMerged, pch=19, main="ZINB")

#total number of detected genes in the cell
df <- data.frame(W1=zinb_3@W[,1], W2=zinb_3@W[,2], W3=zinb_3@W[,3], gamma_mu = zinb_3@gamma_mu[1,], gamma_pi = zinb_3@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                         W1          W2           W3     gamma_mu
## W1              1.00000000 -0.33654363 -0.019749590 -0.217045507
## W2             -0.33654363  1.00000000 -0.013168732  0.045486125
## W3             -0.01974959 -0.01316873  1.000000000  0.003234138
## gamma_mu       -0.21704551  0.04548612  0.003234138  1.000000000
## gamma_pi        0.47335254 -0.17336272  0.108652182 -0.355591805
## detection_rate -0.51908169  0.19423094 -0.108389528  0.509108822
## coverage       -0.12447772  0.06739511  0.013016634  0.952445360
##                  gamma_pi detection_rate    coverage
## W1              0.4733525     -0.5190817 -0.12447772
## W2             -0.1733627      0.1942309  0.06739511
## W3              0.1086522     -0.1083895  0.01301663
## gamma_mu       -0.3555918      0.5091088  0.95244536
## gamma_pi        1.0000000     -0.9497425 -0.34834942
## detection_rate -0.9497425      1.0000000  0.46821094
## coverage       -0.3483494      0.4682109  1.00000000

Removing glia

noglia_data <- cortical[,rownames(labels[!(labels$ClusterMergedLabel %in% c("-1", "m4", "m5")),])]
filter <- apply(assay(noglia_data)>10, 1, sum)>=10

noglia <- assay(noglia_data[filter,])
colNoglia <- labels[!(labels$ClusterMergedLabel %in% c("-1", "m4", "m5")), "ClusterMergedColor"]

vars <- rowVars(log1p(noglia))
names(vars) <- rownames(noglia)
vars <- sort(vars, decreasing = TRUE)
noglia <- noglia[names(vars)[1:1000],]

batch <- droplevels(colData(noglia_data)$MD_c1_run_id)
qc <- as.matrix(colData(noglia_data)[,1:14])
qcpca <- prcomp(qc, center=TRUE, scale.=TRUE)
mod <- model.matrix(~batch + qcpca$x[,1:2])
print(system.time(zinb_noglia <- zinbFit(noglia, ncores = 3, K = 3, X=mod)))
##    user  system elapsed 
## 402.567  33.575 169.758
pairs(zinb_noglia@W, col=colNoglia, pch=19, main="ZINB")

Removing “Pink cells”

nopink_data <- cortical[,rownames(labels[!(labels$ClusterMergedLabel %in% c("-1", "m2", "m4", "m5", "m6")),])]
filter <- apply(assay(nopink_data)>10, 1, sum)>=10

nopink <- assay(nopink_data[filter,])
colNopink <- labels[!(labels$ClusterMergedLabel %in% c("-1", "m2", "m4", "m5", "m6")), "ClusterMergedColor"]

vars <- rowVars(log1p(nopink))
names(vars) <- rownames(nopink)
vars <- sort(vars, decreasing = TRUE)
nopink <- nopink[names(vars)[1:1000],]

batch <- droplevels(colData(nopink_data)$MD_c1_run_id)
qc <- as.matrix(colData(nopink_data)[,1:14])
qcpca <- prcomp(qc, center=TRUE, scale.=TRUE)
mod <- model.matrix(~batch + qcpca$x[,1:2])
print(system.time(zinb_nopink <- zinbFit(nopink, ncores = 3, K = 3, X=mod)))
##    user  system elapsed 
## 370.877  55.012 159.651
pairs(zinb_nopink@W, col=colNopink, pch=19, main="ZINB")

Using all genes

core <- assay(l5_data)
batch <- droplevels(colData(l5_data)$MD_c1_run_id)
qc <- as.matrix(colData(l5_data)[,1:14])
qcpca <- prcomp(qc, center=TRUE, scale.=TRUE)
mod <- model.matrix(~batch + qcpca$x[,1:2])

print(system.time(zinb_all <- zinbFit(core, ncores = 3, K = 3, X=mod)))
## Warning in optimize(f = zinb.loglik.dispersion, Y = Y, mu = mu, logitPi =
## logitPi, : NA/Inf replaced by maximum positive value

## Warning in optimize(f = zinb.loglik.dispersion, Y = Y, mu = mu, logitPi =
## logitPi, : NA/Inf replaced by maximum positive value

## Warning in optimize(f = zinb.loglik.dispersion, Y = Y, mu = mu, logitPi =
## logitPi, : NA/Inf replaced by maximum positive value
##     user   system  elapsed 
## 3408.537  501.766 1541.282
pairs(zinb_all@W, col=colMerged, pch=19, main="ZINB")